Load packages
library(data.table)
library(stringr)
library(ggplot2)
theme_py <- theme_light() + theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(colour = "black", fill = NA),
text = element_text(size = 20),
strip.placement = "outside",
strip.text = element_text(size=20, color = "black"),
strip.background = element_rect(fill = "white")
)
theme_set(theme_py)
library(patchwork)
library(ggrepel)
library(ComplexHeatmap)
library(DESeq2)
library(consensusSeekeR)
library(BSgenome.jaNemVect1.1.DToL.Assembly)
library(GenomicRanges)
library(parallel)
library(universalmotif)
library(monaLisa)
library(rtracklayer)
library(ComplexUpset)
source("../motif-analysis/mta_downstream_functions.R")
source("scripts/functions.R")
Directories and colors
dat_dir <- "ATACSEQ/nucleosome_free_regions/"
pks_dir <- file.path(dat_dir, "macs2_peaks")
cns_dir <- file.path(dat_dir, "consensus_peaks")
hom_dir <- file.path(dat_dir, "homer")
ftp_dir <- file.path(dat_dir, "footprint")
bind_dir <- file.path(dat_dir, "footprint", "BINDetect")
res_dir <- file.path(dat_dir, "results")
fig_dir <- file.path(dat_dir, "plots")
for (newdir in c(cns_dir, hom_dir, res_dir, fig_dir))
dir.create(newdir, showWarnings = FALSE)
# colors
condition_cols <- c("positive" = "blue", "negative" = "red")
line_cols <- c("Elav" = "#ff7f00", "Fox" = "#984ea3", "Ncol" = "#4daf4a")
line_cond_cols <- c(
"Elav_pos" = "#ff7f00", "Fox_pos" = "#984ea3", "Ncol_pos" = "#4daf4a",
"Elav_neg" = "#ebbd8f", "Fox_neg" = "#d18adb", "Ncol_neg" = "#90d18e"
)
Gene annotations
marks_gold <- fread(
file.path("annotation", "golden-marks-231124.tsv"),
fill = TRUE, sep = "\t", header = FALSE
)[, c(2:1)]
setnames(marks_gold, c("gene", "name"))
marks_gold_v <- structure(marks_gold$name, names = marks_gold$gene)
tfs_annt <- fread(
file.path("annotation", "tfs.Nvec.tsv"),
header = TRUE
)[, .SD[1], gene]
setnames(tfs_annt, c("gene", "name", "pfam"))
gene_annt <- fread(
file.path("annotation", "Nvec_annotation_v3_2020-10-23_DToL_names"),
select = 1:3
)
setnames(gene_annt, c("gene", "pfam", "name"))
gene_ann <- rbindlist(list(
gene_annt[!gene %in% tfs_annt$gene],
tfs_annt
), use.names = TRUE)
gene_ann[gene %in% marks_gold$gene, name := marks_gold_v[gene]]
Find consensus set of peaks
require(consensusSeekeR)
require(BSgenome.jaNemVect1.1.DToL.Assembly)
require(parallel)
# load peaks
pks_files <- list.files(
pks_dir,
pattern = "narrowPeak",
recursive = FALSE,
full.names = TRUE
)
names(pks_files) <- str_remove(
basename(pks_files),
".mLb.clN.ncfree_peaks.narrowPeak"
)
nP_list <- lapply(names(pks_files), function(x) {
nP <- readNarrowPeakFile(
pks_files[x],
extractRegions = TRUE,
extractPeaks = TRUE
)
names(nP$narrowPeak) <- rep(x, length(nP$narrowPeak))
names(nP$peak) <- rep(x, length(nP$peak))
nP
})
regions <- GenomicRanges::GRangesList(lapply(nP_list, function(x) x$narrowPeak))
peaks <- GenomicRanges::GRangesList(lapply(nP_list, function(x) x$peak))
names(regions) <- names(pks_files)
names(peaks) <- names(pks_files)
# get consensus
chrList <- Seqinfo(
seqnames = seqnames(BSgenome.jaNemVect1.1.DToL.Assembly),
seqlengths = seqlengths(BSgenome.jaNemVect1.1.DToL.Assembly),
isCircular = c(
rep(FALSE, length(seqnames(BSgenome.jaNemVect1.1.DToL.Assembly)) - 1),
TRUE
),
genome = "jaNemVect1.1"
)
message(Sys.time(), " Started calculating consensus")
ur <- unlist(regions)
up <- unlist(peaks)
results <- findConsensusPeakRegions(
narrowPeaks = ur,
peaks = up,
chrInfo = chrList,
extendingSize = 250,
expandToFitPeakRegion = FALSE,
shrinkToFitPeakRegion = FALSE,
minNbrExp = 2,
nbrThreads = detectCores() - 1
)
message(Sys.time(), " Done calculating consensus")
saveRDS(results, file.path(cns_dir, "consensusSeekeR-results.RDS"))
saveRDS(results, file.path(cns_dir, "consensusSeekeR-results.RDS"))
# resize peaks
pks_cns <- results$consensusRanges
pks_mid <- start(pks_cns) + (end(pks_cns) - start(pks_cns)) / 2
ranges(pks_cns) <- IRanges(pks_mid, width = 0)
pks_scl <- promoters(pks_cns, upstream = 125, downstream = 125)
# trim out-of-bound peaks
seqlengths(pks_scl) <- seqlengths(
BSgenome.jaNemVect1.1.DToL.Assembly
)[names(seqlengths(pks_scl))]
pks_scl <- trim(pks_scl)
# save bed file
pks_bed <- as.data.table(pks_scl)
pks_bed[, name := paste0("peak", seq_len(.N))]
pks_bed[, width := NULL][, score := "."]
setcolorder(pks_bed, c("seqnames", "start", "end", "name", "score", "strand"))
fwrite(
pks_bed,
file.path(cns_dir, "consensusSeekeR-peaks.bed"),
sep = "\t",
col.names = FALSE
)
Get scores for consensus peaks in all samples.
nth=12
res="ATACSEQ/nucleosome_free_regions/consensus_peaks/consensusSeekeR-peaks-counts.tsv"
bed="ATACSEQ/nucleosome_free_regions/consensus_peaks/consensusSeekeR-peaks.bed"
bam=$( echo ATACSEQ/nucleosome_free_regions/bam/*R*bam )
cols=$( echo -e seqnames start end name score strand | column -t )
for b in $bam
do
echo $b
name=$( basename $b)
cols=$( echo -e ${cols} ${name} | column -t )
done
echo -e ${cols} > ${res%%tsv}txt
bedtools multicov -bams ${bam} -bed ${bed} > ${res}
Data for differential peaks analysis
# load counts in peaks
pks_ct <- fread(
file.path(cns_dir, "consensusSeekeR-peaks-counts.tsv"),
header = FALSE
)
colnames <- readLines(
file.path(cns_dir, "consensusSeekeR-peaks-counts.txt"),
n = 1
)
colnames <- str_remove_all(colnames, ".mLb.clN.ncfree.sorted.bam")
colnames <- str_split(colnames, " ")[[1]]
colnames(pks_ct) <- colnames
# column data
col_dt <- data.table(sample = colnames(pks_ct)[7:28])
col_dt[, reporterline := str_extract(sample, "Elav|Fox|Ncol")]
col_dt[, reporterline := factor(reporterline, levels=c("Elav", "Fox", "Ncol"))]
col_dt[, condition := str_extract(sample, "pos|neg")]
col_dt[, condition := str_replace_all(
condition,
c("pos" = "positive", "neg" = "negative")
)]
col_dt[, condition := factor(condition, levels = c("negative", "positive"))]
col_dt[, group := paste(
as.character(reporterline), as.character(condition), sep = ""
), by = seq_len(nrow(col_dt))]
col_dt[, group := factor(group)]
fwrite(col_dt, file.path(res_dir, "design.tsv"), sep = "\t")
col_df <- copy(col_dt)
class(col_df) <- "data.frame"
rownames(col_df) <- col_df$sample
# counts matrix
pks_mt <- as.matrix(pks_ct[, -c(1:6)])
rownames(pks_mt) <- pks_ct$name
pks_mt <- pks_mt[, rownames(col_df)]
write.table(
pks_mt,
file.path(res_dir, "mat.tsv"),
sep = "\t",
quote = FALSE
)
# DESeq2
require(DESeq2)
dds <- DESeqDataSetFromMatrix(
countData = pks_mt,
colData = col_df,
design = ~ condition + reporterline
)
saveRDS(dds, file.path(res_dir, "dds.rds"))
Peaks counts distributions
# normalized accessibility distribution
pks_dt <- as.data.table(pks_mt, keep.rownames = "peak")
pks_dt <- melt.data.table(
pks_dt,
id.vars = "peak",
variable.name = "sample",
value.name = "norm_counts"
)
pks_dt <- merge.data.table(pks_dt, col_dt, by = "sample")
setorder(pks_dt, peak, condition, reporterline)
pks_dt[, sample := factor(sample, levels = unique(pks_dt$sample))]
gp_acc <- ggplot(pks_dt, aes(sample, log10(norm_counts), fill = reporterline)) +
geom_violin(scale = "width", alpha = 0.8, color = "black") +
geom_boxplot(width = 0.5, outlier.shape = NA, alpha = 0.8, color = "black") +
scale_fill_manual(values = line_cols, limits = force) +
scale_y_continuous(expand = expansion(mult = c(0, 0.01))) +
labs(x = "samples", y = "peak\naccessibility") +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
legend.title = element_blank(), legend.position = "none"
)
var_dt <- pks_dt[, .(var = var(norm_counts)), peak]
gp_var <- ggplot(var_dt, aes(log10(var))) +
geom_density() +
scale_x_continuous(limits = c(-4, NA)) +
scale_y_continuous(expand = expansion(mult = c(0, 0.01))) +
labs(x = "log10(accessibility variance)")
gp_pch <- gp_acc / gp_var + plot_layout(heights = c(2, 1))
gp_pch
Use normalized log-transformed accessibility data
# normalize samples
dds <- readRDS(file.path(res_dir, "dds.rds"))
dds <- estimateSizeFactors(dds)
norm_mt <- counts(dds, normalized = TRUE)
# save
write.table(
norm_mt,
file.path(res_dir, "mat_norm.tsv"),
sep = "\t",
row.names = TRUE,
quote = FALSE
)
# row normalize to bring peaks to same range
norm_mt <- (norm_mt + 10) / apply(norm_mt + 10, 1, median)
norm_mt <- log2(norm_mt)
# save
write.table(
norm_mt,
file.path(res_dir, "mat_norm_fc.tsv"),
sep = "\t",
row.names = TRUE,
quote = FALSE
)
PCA on all samples.
set.seed(1950)
pca_res <- prcomp(t(norm_mt), center = TRUE)
# variance explained
pca_var <- data.table(
pct_var = round(((pca_res$sdev) ^ 2 / sum((pca_res$sdev) ^ 2)* 100), 2)
)
pca_var[,pct_cum:=cumsum(pct_var)]
pca_var[,PC:=factor(1:.N)]
gp_var <- ggplot(pca_var, aes(PC, pct_var)) +
geom_bar(stat = "identity") +
geom_line(aes(y = pct_cum, group = 1)) +
geom_point(aes(y = pct_cum)) +
scale_y_continuous(expand = expansion(0.01,0)) +
labs(y = "% of variance\nexplained", x = "PC") +
theme(panel.grid.major.y = element_line(size = 0.5))
pca_dt <- as.data.table(pca_res$x, keep.rownames = "sample")
pca_dt <- merge.data.table(col_dt, pca_dt, by="sample", sort=FALSE)
gp_bip <- ggplot(pca_dt, aes(PC1, PC2, fill=reporterline, shape=condition)) +
geom_point(size = 5) +
scale_fill_manual(values = line_cols) +
scale_shape_manual(values = c("positive" = 21, "negative" = 24)) +
guides(fill = guide_legend(override.aes=list(shape=21))) +
geom_text_repel(aes(label = sample))
gp_pch <- gp_var / gp_bip
gp_pch
PCA per cell line
set.seed(1950)
gp_l <- lapply(names(line_cols), function(cl) {
pca_res <- prcomp(t(norm_mt[,grep(cl,colnames(norm_mt))]), center = TRUE)
# variance explained
pca_var <- data.table(
pct_var = round(((pca_res$sdev) ^ 2 / sum((pca_res$sdev) ^ 2)* 100), 2)
)
pca_var <- pca_var[-nrow(pca_var)]
pca_var[,pct_cum:=cumsum(pct_var)]
pca_var[,PC:=factor(1:.N - 1)]
gp_var <- ggplot(pca_var, aes(PC, pct_var)) +
geom_bar(stat = "identity") +
geom_line(aes(y = pct_cum, group = 1)) +
geom_point(aes(y = pct_cum)) +
scale_y_continuous(expand = expansion(0.01,0)) +
labs(y = "% of variance\nexplained", x = "PC") +
theme(panel.grid.major.y = element_line(size = 0.5))
# pca plot
pca_dt <- as.data.table(pca_res$x, keep.rownames = "sample")
pca_dt <- merge.data.table(col_dt, pca_dt, by="sample", sort=FALSE)
gp_bip <- ggplot(pca_dt, aes(PC1, PC2, fill=condition, shape=condition)) +
geom_point(size=5) +
scale_shape_manual(values = c("positive" = 21, "negative" = 24)) +
scale_fill_manual(values = condition_cols) +
guides(fill = guide_legend(override.aes=list(shape=21))) +
geom_text_repel(aes(label = sample))
gp_var / gp_bip
})
gp_l
## [[1]]
##
## [[2]]
##
## [[3]]
Use normalized peak counts
norm_mt <- read.table(
file.path(res_dir, "mat_norm_fc.tsv"),
header = TRUE
)
Identify marker peaks
# gene markers by high FC + significant DEseq2 LTR test
dds <- readRDS(file.path(res_dir, "dds.rds"))
dds <- DESeq(dds, test="LRT", reduced=~1)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dds_res <- results(dds)
dds_qval <- dds_res$padj
names(dds_qval) <- rownames(dds_res)
peaks_deseq <- names(which(dds_qval<1e-2))
peaks_high <- names(which(apply(
norm_mt, 1, function(x) sort(x, decreasing = TRUE)[2] >= 1.8
)))
peaks_marks <- intersect(peaks_high, peaks_deseq)
length(peaks_marks)
## [1] 2844
Cluster peaks
set.seed(1950)
# determine k for kmeans
ks <- 1:30
tot_withinss <- sapply(ks, function(k) {
cl <- kmeans(norm_mt[peaks_marks,], k)
cl$tot.withinss
})
elbow_dt <- data.table(k = ks, tot_withinss = tot_withinss)
elbow_gp <- ggplot(elbow_dt, aes(x = k, y = tot_withinss)) +
geom_line() + geom_point()+
scale_x_continuous(breaks = ks)
elbow_gp
Cluster peaks
# kmeans
set.seed(1950)
k <- 19
cl <- kmeans(norm_mt[peaks_marks,], k)
peaks_order_list <- tapply(names(cl$cluster), cl$cluster, function(gs) {
cor_peaks <- cor(t(norm_mt[gs,]))
hclust_peaks <- hclust(as.dist(
1 - cor(cor_peaks)),
method = "ward.D2"
)
rownames(cor_peaks)[hclust_peaks$order]
})
names(peaks_order_list) <- unique(cl$cluster)
peaks_order_list <- peaks_order_list[as.character(seq_along(peaks_order_list))]
# cluster clusters
cluster_order <- hclust(dist(
cor(t(cl$centers)), method = "euclidean"
), method = "ward.D2")$order
peaks_order_list <- peaks_order_list[as.character(cluster_order)]
peaks_order <- unname(unlist(peaks_order_list))
clusters_dt <- data.table(
peak = peaks_order,
clusters = as.character(rep(
names(peaks_order_list),
sapply(peaks_order_list, length)
))
)
# group clusters (manually)
clusters_dt[clusters %in% c(16), group :=1 ] # Elav
clusters_dt[clusters %in% c(5), group :=2 ] # Elav + Fox
clusters_dt[clusters %in% c(14,17), group := 3] # Fox
clusters_dt[clusters %in% c(6,11), group := 4] # Fox + Ncol
clusters_dt[clusters %in% c(9), group := 5]
clusters_dt[clusters %in% c(13,4), group := 6] # Elav + Ncol
clusters_dt[clusters %in% c(1,2,15,7,8,10,19,18), group := 7] # Ncol
clusters_dt[clusters %in% c(12), group := 8]
clusters_dt[clusters %in% c(3), group := 9]
setorder(clusters_dt, group)
peaks_order <- clusters_dt$peak
# add clusters info to peaks coordinates bed file
pks_bed <- fread(file.path(cns_dir, "consensusSeekeR-peaks.bed"))
setnames(pks_bed, c("V4"), c("peak"))
clusters_bed <- merge.data.table(
pks_bed, clusters_dt,
by = "peak",
sort = FALSE
)
setcolorder(clusters_bed, c(colnames(pks_bed)))
clusters_bed[, clusters := paste0("C", clusters)]
clusters_bed[, group := paste0("G", group)]
fwrite(
clusters_bed,
file.path(res_dir, "consensusSeekeR-peaks-clusters.bed"),
sep = "\t",
col.names = FALSE
)
Heatmap of markers
# order rows and columns
samples_order <- col_dt[order(condition,reporterline)]$sample
plot_mt <- norm_mt[peaks_order,samples_order]
# center at 0
plot_min <- quantile(abs(range(plot_mt)),0.75)
plot_min <- 5
plot_mt <- pmin(pmax(plot_mt,-plot_min),plot_min)
# heatmap colors
col_vec <- colorRampPalette(
RColorBrewer::brewer.pal(11,'BrBG')
)(1000)
col_fun <- circlize::colorRamp2(
seq(-plot_min, plot_min, length.out = length(col_vec)),
col_vec
)
# color annotaitons
col_ann <- HeatmapAnnotation(
which = "column", border = TRUE,
"reporterline" = as.character(
col_dt[match(colnames(plot_mt),sample)]$reporterline
),
"condition" = as.character(
col_dt[match(colnames(plot_mt),sample)]$condition
),
col = list(
"reporterline" = line_cols,
"condition" = condition_cols
)
)
# # peak module annotations (clusters)
# clann <- clusters_dt[match(rownames(plot_mt),peak)]$clusters
# clann_lab <- unique(clusters_dt[match(rownames(plot_mt),peak)]$clusters)
# clann <- factor(clann, levels=clann_lab)
# module_ann <- HeatmapAnnotation(
# which = "row", border = TRUE,
# "cluster" = anno_block(
# labels = clann_lab,
# gp = gpar(col=NA)
# )
# )
# row_split <- clann
# peak module annotations (manually groupped clusters)
grann <- clusters_dt[match(rownames(plot_mt),peak)]$group
grann_lab <- unique(clusters_dt[match(rownames(plot_mt),peak)]$group)
grann <- factor(grann, levels=grann_lab)
module_ann <- HeatmapAnnotation(
which = "row", border = TRUE,
"group" = anno_block(
labels = grann_lab,
gp = gpar(col=NA)
)
)
row_split <- grann
# peaks annotations
row_labels_marks_ids <- match(
clusters_dt[,.SD[1],clusters]$peak,
rownames(plot_mt)
)
row_labels_marks <- clusters_dt[,.SD[1], clusters]$clusters
mark_ann <- HeatmapAnnotation(
which = "row",
marker = anno_mark(at = row_labels_marks_ids, labels = row_labels_marks),
show_legend = FALSE
)
mark_ann <- HeatmapAnnotation(
which = "row", border = TRUE,
"padj<0.05" = rownames(plot_mt) %in% peaks_deseq,
"var>1" = rownames(plot_mt) %in% peaks_vari,
"FC>2" = rownames(plot_mt) %in% peaks_fc,
col = list(
"padj<0.05" = c("TRUE" = "#e6ab02", "FALSE"="#d9d9d9"),
"var>1" = c("TRUE" = "#3690c0", "FALSE"="#d9d9d9"),
"FC>2" = c("TRUE" = "#e7298a", "FALSE"="#d9d9d9")
)
)
# heatmap
hm <- Heatmap(
plot_mt, name = "normalized\naccessibility",
col = col_fun,
cluster_rows = FALSE, cluster_columns = FALSE,
show_row_names = FALSE, show_column_names = TRUE,
row_title = "peaks",
row_split = row_split,
cluster_row_slices = FALSE,
top_annotation = col_ann,
left_annotation = module_ann, right_annotation = mark_ann,
border = TRUE
)
hm
Load data
# peaks
peaks <- fread(file.path(cns_dir, "consensusSeekeR-peaks.bed"))
setnames(peaks, c("seqnames", "start", "end", "name", "score", "strand"))
peaks_gr <- makeGRangesFromDataFrame(peaks, keep.extra.columns = TRUE)
# genes
genes_gr <- rtracklayer::import(
"annotation/Nvec_v4_merged_annotation_sort.gtf",
"gtf"
)
genes_gr <- genes_gr[genes_gr$type == "transcript"]
genes_gr$name <- genes_gr$transcript_id
# non-expressed genes to remove
counts <- read.table(
"RNASEQ_QUANTIFICATION/raw_counts_rnaseq.tsv",
header = TRUE,
row.names = 1
)
exclude_genes <- names(which(
apply(counts, 1, function(x) sum(x) < 10)
))
length(exclude_genes) # 437
## [1] 437
Assign peaks to genes
# assign
assign <- mta_match_peaks_to_genes(
gff_object = genes_gr,
peak_object = peaks_gr,
index_object = "genome/Nvec_vc1.1_gDNA.fasta.fai",
list_genes = NULL,
feature_to_match = "transcript",
feature_field = "name",
exclude_genes = NULL,
max_tss_dist = 10000,
min_overlap = 0,
max_gap = 1,
promoter_upstream = 200,
promoter_downstream = 50,
promoter_object = NULL
)
setDT(assign)
setnames(assign, "chr", "seqnames")
assign[, seqnames := factor(seqnames, levels = unique(peaks$seqnames))]
setorder(assign, seqnames, start, end)
# save
fwrite(
assign,
file.path(res_dir, "consensusSeekeR-peaks-gene-assignment.tsv"),
sep = "\t",
col.names = TRUE
)
Inspect peak assignment results
assign <- fread(file.path(res_dir, "consensusSeekeR-peaks-gene-assignment.tsv"))
max_assign_peaks_per_gene <- unique(
assign[, .(peak, gene)][!is.na(gene)]
)[, .N, gene]
gb1 <- ggplot(max_assign_peaks_per_gene, aes(N)) +
geom_bar() +
labs(x = "peaks per gene", y = "genes") +
scale_y_continuous(expand = expansion(mult = c(0, 0.01))) +
scale_x_continuous(
breaks = c(1, seq(5, max(max_assign_peaks_per_gene$N), 5))
)
max_assign_gene_per_peak <- unique(
assign[, .(peak, gene)][!is.na(gene)]
)[, .N, peak]
gb2 <- ggplot(max_assign_gene_per_peak, aes(N)) +
geom_bar() +
labs(x = "genes per peak", y = "peaks") +
scale_y_continuous(
expand = expansion(mult = c(0, 0.01)),
labels = scales::label_number(scale = 0.001, suffix = "K")
) +
scale_x_continuous(breaks = seq(1, max(max_assign_peaks_per_gene$N)))
gb1 + gb2
Calculate gene accessibility scores as weighted sum of normalized peak scores assigned to genes.
assign <- fread(file.path(res_dir, "consensusSeekeR-peaks-gene-assignment.tsv"))
setnames(assign, "seqnames", "chr")
genes_gff <- genes_gr#[!genes_gr$name %in% exclude_genes]
genes_gff$symbol <- genes_gff$name
peaks_mt <- read.table(file.path(res_dir, "mat_norm.tsv"))
# cells_groups <- fread(file.path(res_dir, "design.tsv"))[, .(sample, group)]
# setnames(cells_groups, "sample", "cell")
gscore <- mta_gene_scores(
genes_peaks_table = assign,
gff_object = genes_gff,
peak_object = peaks_gr,
peaks_mat = peaks_mt
)
saveRDS(gscore, file.path(res_dir, "consensusSeekeR-gene-scores.rds"))
write.table(
gscore$genes_scores_matrix,
file.path(res_dir, "consensusSeekeR-gene-scores.tsv"),
sep = "\t",
col.names = TRUE
)
Use normalized gene scores
# load gene scores
norm_mt <- read.table(
file.path(res_dir, "consensusSeekeR-gene-scores.tsv"),
header = TRUE
)
# row normalize to bring genes to same range
norm_mt <- (norm_mt + 10) / apply(norm_mt + 10, 1, median)
norm_mt <- log2(norm_mt)
con_mt <- read.table(
file.path(res_dir, "consensusSeekeR-gene-scores-raw.tsv"),
)
# gene markers by high FC + significant DEseq2 LTR test
dds <- DESeqDataSetFromMatrix(
countData = con_mt,
colData = col_df,
design = ~ condition + reporterline + condition:reporterline
)
dds <- DESeq(dds, test = "LRT", reduced = ~ 1)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dds_res <- results(dds)
dds_qval <- dds_res$padj
names(dds_qval) <- rownames(dds_res)
genes_deseq <- names(which(dds_qval < 1e-2))
genes_high <- names(which(apply(norm_mt, 1, function (x)
sort(x, decreasing = TRUE)[1] > 1.8
)))
gene_marks <- intersect(genes_high, genes_deseq)
# inclusde golden markers
marks_gold <- fread(
file.path("annotation", "golden-marks-230116.tsv"),
fill = TRUE
)[, 1:2]
setnames(marks_gold, c("gene", "name"))
marks_gold[, name := str_remove(name, "^Nv")]
gene_marks <- unique(c(gene_marks, marks_gold[gene %in% genes_deseq]$gene))
Select number of clusters for genes
set.seed(1950)
# determine k for kmeans
ks <- 1:30
tot_withinss <- sapply(ks, function(k) {
cl <- kmeans(norm_mt[gene_marks, ], k)
cl$tot.withinss
})
elbow_dt <- data.table(k = ks, tot_withinss = tot_withinss)
elbow_gp <- ggplot(elbow_dt, aes(x = k, y = tot_withinss)) +
geom_line() + geom_point() +
scale_x_continuous(breaks = ks) +
theme(panel.grid.major = element_line(size = 0.5))
elbow_gp
Cluster genes
set.seed(1950)
# kmeans
k <- 18
cl <- kmeans(norm_mt[gene_marks, ], k)
gene_order_list <- tapply(names(cl$cluster), cl$cluster, function(gs) {
cor_genes <- cor(t(norm_mt[gs,]))
hclust_genes <- hclust(as.dist(1 - cor(cor_genes)), method = "ward.D2")
rownames(cor_genes)[hclust_genes$order]
})
names(gene_order_list) <- unique(cl$cluster)
gene_order_list <- gene_order_list[as.character(seq_along(gene_order_list))]
# cluster clusters
cluster_order <- hclust(
dist(cor(t(cl$centers)),
method = "euclidean"),
method = "ward.D2"
)$order
gene_order_list <- gene_order_list[as.character(cluster_order)]
gene_order <- unname(unlist(gene_order_list[cluster_order]))
clusters_dt <- data.table(
gene = unlist(gene_order_list),
clusters = as.character(
rep(names(gene_order_list), sapply(gene_order_list, length))
)
)
# group clusters (manually)
clusters_dt[clusters %in% c(8, 15), group := 1] # Elav
clusters_dt[clusters %in% c(1, 4), group := 2] # Elav + Fox
clusters_dt[clusters %in% c(16, 2), group := 3] # Fox
clusters_dt[clusters %in% c(3), group := 4] # Fox + Ncol
clusters_dt[clusters %in% c(13), group := 5] # Fox + Ncol + Elav
clusters_dt[clusters %in% c(17, 5), group := 6] # Elav + Ncol
clusters_dt[clusters %in% c(7, 14, 6, 10, 11, 9), group := 7] # Ncol
clusters_dt[clusters %in% c(12), group := 8]
clusters_dt[clusters %in% c(18), group := 9]
setorder(clusters_dt, group)
gene_order <- clusters_dt$gene
# save
fwrite(
clusters_dt,
file.path(res_dir, "genes_clusters.tsv"),
sep = "\t"
)
Heatmap of markers
# order rows and columns
col_dt <- fread(file.path(res_dir, "design.tsv"))
samples_order <- col_dt[order(condition, reporterline)]$sample
plot_mt <- norm_mt[gene_order, samples_order]
# center at 0
plot_min <- quantile(abs(range(plot_mt)), 0.75)
plot_min <- 4
plot_mt <- pmin(pmax(plot_mt, -plot_min), plot_min)
# heatmap colors
col_vec <- colorRampPalette(RColorBrewer::brewer.pal(11, 'BrBG'))(1000)
col_fun <- circlize::colorRamp2(
seq(-plot_min, plot_min, length.out = length(col_vec)),
col_vec
)
# color annotations
col_ann <- HeatmapAnnotation(
which = "column",
border = TRUE,
"reporterline" = as.character(
col_dt[match(colnames(plot_mt), sample)]$reporterline
),
"condition" = as.character(
col_dt[match(colnames(plot_mt), sample)]$condition
),
col = list("reporterline" = line_cols, "condition" = condition_cols)
)
# gene module annotations
clann <- clusters_dt[match(rownames(plot_mt), gene)]$clusters
clann_lab <- unique(clusters_dt[match(rownames(plot_mt), gene)]$clusters)
clann <- factor(clann, levels = clann_lab)
module_ann <- HeatmapAnnotation(
which = "row", border = TRUE,
"cluster" = anno_block(
labels = clann_lab,
gp = gpar(col = NA)
)
)
row_split <- clann
# gene module annotations (manual groups)
grann <- clusters_dt[match(rownames(plot_mt), gene)]$group
grann_lab <- unique(clusters_dt[match(rownames(plot_mt), gene)]$group)
grann <- factor(grann, levels = grann_lab)
module_ann <- HeatmapAnnotation(
which = "row", border = TRUE,
"group" = anno_block(
labels = grann_lab,
gp = gpar(col = NA)
)
)
row_split <- grann
# gene module annotations (manual groups)
grann <- clusters_dt[match(rownames(plot_mt), gene)]$group
grann_lab <- unique(clusters_dt[match(rownames(plot_mt), gene)]$group)
grann <- factor(grann, levels = grann_lab)
module_ann <- HeatmapAnnotation(
which = "row", border = TRUE,
"group" = anno_block(
labels = grann_lab,
gp = gpar(col = NA)
)
)
row_split <- grann
# genes annotations
marks_gold <- fread(
file.path("annotation", "golden-marks-230116.tsv"),
fill = TRUE
)[, 1:2]
setnames(marks_gold, c("gene", "name"))
marks_gold[, name := str_remove(name, "^Nv")]
tfs_annt <- fread(
file.path("annotation", "curated_TFh_Nvec_DToL_names.tsv"),
header = FALSE
)
setnames(tfs_annt, c("gene", "pfam", "og"))
marks_tfs <- merge.data.table(
tfs_annt,
marks_gold,
by = "gene", all = TRUE, sort = FALSE
)
marks_tfs[is.na(marks_tfs)] <- ""
marks_tfs <- marks_tfs[gene %in% rownames(plot_mt)]
marks_tfs[
gene %in% clusters_dt[group==5]$gene,
name := str_replace_all(og, c(
"HMGbox_Sox.HG1.25:like:BHMG1/SOX1/SOX2/SOX3/SOX14/SOX15/SOX21/SRY:likeclu:18/26/28" = "SOX1/2/3/14/15/21-like",
"zf-C2H2.HG5.17:PRDM6" = "zf-C2H2 PRDM6",
"zf-C2H2.Unclassified" = ""
))
]
marks_tfs <- marks_tfs[name != ""]
row_labels_marks_ids <- match(marks_tfs$gene, rownames(plot_mt))
row_labels_marks <- marks_tfs[
match(rownames(plot_mt)[row_labels_marks_ids], gene)
]$name
mark_ann <- HeatmapAnnotation(
which = "row", border = TRUE,
"padj<0.01" = rownames(plot_mt) %in% genes_deseq,
# "var>1" = rownames(plot_mt) %in% genes_vari,
#"FC>2" = rownames(plot_mt) %in% genes_fc,
"marker" = anno_mark(
at = row_labels_marks_ids,
labels = row_labels_marks
),
col = list(
"padj<0.01" = c("TRUE" = "#e6ab02", "FALSE"="#d9d9d9"),
"var>1" = c("TRUE" = "#3690c0", "FALSE"="#d9d9d9"),
"FC>2" = c("TRUE" = "#e7298a", "FALSE"="#d9d9d9")
)
)
# heatmap
hm <- Heatmap(
plot_mt, name = "normalized\naccessibility",
col = col_fun,
cluster_rows = FALSE, cluster_columns = FALSE,
show_row_names = FALSE, show_column_names = TRUE,
row_title = "genes",
row_split = row_split,
cluster_row_slices = FALSE,
top_annotation = col_ann,
right_annotation = mark_ann,
left_annotation = module_ann,
border = TRUE
)
hm
Score archetype motifs in consensus peaks
require(universalmotif)
require(monalisa)
# motifs
mots_lst <- readRDS(
file.path("annotation", "motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp-pwms.rds")
)
mona_lst <- mta_convert_umot_to_monalisa(mots_lst)
# peaks
peaks <- fread(file.path(cns_dir, "consensusSeekeR-peaks.bed"))
setnames(peaks, c("seqnames", "start", "end", "name", "score", "strand"))
peaks_gr <- makeGRangesFromDataFrame(peaks, keep.extra.columns = TRUE)
# genome
genome <- Biostrings::readDNAStringSet("genome/Nvec_vc1.1_gDNA.fasta")
seqdt <- fread("genome/Nvec_vc1.1_gDNA.fasta.fai")[,1:2]
seqlvl <- c(seqdt[[1]], "ENA|OW052000|OW052000.1")
# scanning
mta_scores_mona <- mta_gw_motif_score_monalisa(
motifs = mona_lst,
genome_object = genome,
index_object = seqdt,
bin_width = 250,
subsample_fraction = 0.10,
score_quantiles = c(0, 0.25, 0.5, 0.75, 0.95, 0.98, 0.99, 0.995, 0.999, 1.0),
score_quantile_thr = 0.95,
do_gw_scan = TRUE,
given_gr = peaks_gr,
nthreads = 16
)
saveRDS(
mta_scores_mona,
file.path(res_dir, "motif-scores-archetypes-mona.rds")
)
# map motifs back to peaks
mta_hits <- mta_scores_mona$gw_scan
mta_cent <- narrow(mta_hits, start = width(mta_hits)/2, width = 1)
mta_ovls <- findOverlaps(query = mta_hits, subject = peaks_gr)
mta_scor <- mta_hits[queryHits(mta_ovls)]
pro_scor <- peaks_gr[subjectHits(mta_ovls)]
mcols(mta_scor) <- cbind(mcols(mta_scor), mcols(pro_scor))
# add genes
assign <- fread(
file.path(res_dir, "consensusSeekeR-peaks-gene-assignment.tsv")
)[, .(peak, gene)]
mta_data <- as.data.table(mta_scor)
mta_data[, score := NULL]
setnames(mta_data, c("name", "motif_score"), c("peak", "score"))
mta_annt <- merge.data.table(
mta_data, assign,
by = "peak",
all.x = TRUE,
allow.cartesian = TRUE
)
mta_annt[is.na(gene), gene := ""]
setcolorder(mta_annt, colnames(mta_data))
# order
mta_annt[, peak := factor(peak, levels = peaks$name)]
setkey(mta_annt, peak)
mta_annt <- unique(mta_annt)
# save
fwrite(
mta_annt,
file.path(res_dir, "motif-scores-archetypes-mona-annot.tsv"),
sep = "\t"
)
Motif enrichment in groups defined before
# load peaks clustering to groups
pks <- fread(
file.path(res_dir, "consensusSeekeR-peaks-clusters.bed")
)
setnames(pks, c("V4", "V8"), c("peak", "group"))
pks[, peak := factor(peak, levels = unique(pks$peak))]
pks[, group := factor(
group,
levels = paste0("G", sort(as.integer(str_remove(unique(pks$group), "G"))))
)]
setkey(pks, group, peak)
mta_scores_mona <- readRDS(
file.path(res_dir, "motif-scores-archetypes-mona.rds")
)
sites_gr <- mta_scores_mona$gw_scan
sites_gr$name <- sites_gr$motif
enr_list <- lapply(as.character(levels(pks$group)), function(g) {
enr_df <- mta_motif_enrichment_test(
sites_object = sites_gr,
peaks_object = peaks_gr,
fg_list = peaks_gr[peaks_gr$name %in% pks[group == g]$peak]$name,
bg_list = peaks_gr[!peaks_gr$name %in% pks[group == g]$peak]$name,
thresholds_vector = NULL, # mta_scores_mona$score_quantiles[,"0.95"],
label = g,
nthreads = 2,
pval_adjust = "BH"
)
setDT(enr_df)
enr_df
})
enr_dt <- rbindlist(enr_list)
# save
fwrite(
enr_dt,
file.path(res_dir, "motif-enrich-archetypes-mona.tsv"),
sep = "\t"
)
Cluster significant motifs
# enrichment scores
dt <- fread(
file.path(res_dir, "motif-enrich-archetypes-mona.tsv")
)
setnames(dt, c("label", "motif"), c("group", "archetype_name"))
dt[, motif := str_extract(archetype_name, "ARCH\\d+")]
# motif to tf assignment
dc <- fread(file.path(
"annotation",
"assignment-archetype-motif-gene.tsv.gz"
))[, .(gene, archetype_name)]
dc[, motif := str_extract(archetype_name, "ARCH\\d+")]
dc[, archetype_name := NULL]
# combine
dt <- merge.data.table(
dt, dc, by = "motif",
all.x = TRUE, sort = FALSE, allow.cartesian = TRUE
)
# qvalue
dat_qv <- dcast.data.table(
unique(dt[, .(motif, group, padj)]), motif ~ group, value.var = "padj"
)
mat_qv <- data.matrix(dat_qv)[, -1]
mat_qv[is.na(mat_qv)] <- 1
rownames(mat_qv) <- dat_qv$motif
write.table(
mat_qv,
file.path(
res_dir, "motif-enrich-archetypes-mona-qvalue.tsv"
),
sep = "\t", quote = FALSE
)
# log2fc
dat_fc <- dcast.data.table(
unique(dt[, .(motif, group, fc)]), motif ~ group, value.var = "fc"
)
mat_fc <- data.matrix(dat_fc)[, -1]
mat_fc[is.na(mat_fc)] <- 0
rownames(mat_fc) <- dat_fc$motif
write.table(
mat_fc,
file.path(res_dir, "motif-enrich-archetypes-mona-fc.tsv"),
sep = "\t", quote = FALSE
)
# highly significant higher value
dt[, minuslog10qval := -1 * log10(padj)]
# filtering
ids <- apply(mat_fc, 1, function(x) max(x) > 1) &
apply(mat_qv, 1, function(x) !is.infinite(min(x)) & !min(x) > 0.001)
# clustering
hc <- hclust(dist(mat_fc[ids, ]), method = "ward.D2")
ds <- dt[motif %in% rownames(mat_fc)[ids]]
ds[, motif := factor(motif, levels = rev(rownames(mat_fc)[ids]))]
# ordering
mord <- order(apply(mat_fc[ids,], 1, which.max))
ds[, motif := factor(motif, levels = rev(rownames(mat_fc[ids, ])[mord]))]
# limit -log10FDR range
ds[, minuslog10qval := pmin(minuslog10qval, 20)]
# golden marks annotations
marks_gold <- fread(
file.path("annotation", "golden-marks-231124.tsv"),
sep = "\t", header = FALSE, fill = TRUE
)[, c(2, 1)]
setnames(marks_gold, c("gene", "name"))
# tf annotations
tfs_annt <- fread(
file.path("annotation", "tfs.Nvec.tsv"),
header = TRUE
)[, .SD[1], gene]
# add annotations
marks_tfs <- merge.data.table(
tfs_annt,
marks_gold,
by = "gene", all = TRUE, sort = FALSE
)
marks_tfs[is.na(marks_tfs)] <- ""
mt_marks_dt <- merge.data.table(
ds, marks_tfs,
by = "gene",
all.x = TRUE
)
mt_marks_dt[, motif := factor(motif, levels = levels(ds$motif))]
setorder(mt_marks_dt, motif)
mt_marks_dt[is.na(name), name := ""]
mt_marks_dt[name == "" & pfam != "", name := pfam]
mt_marks_dt[name == "" & og != "", name := og]
# add labels
mt_marks_dt[, label := name]
mt_marks_dt[label == "", label := archetype_name]
mt_marks_dt[!grepl("^ARCH", label), label := paste(motif, label)]
mt_marks_dt[, label := substr(label, 1, 35)]
mt_marks_dt[, label := factor(label, levels = unique(mt_marks_dt$label))]
All significant motifs enrichment heatmap
# plot
gp <- ggplot(mt_marks_dt, aes(group, label, label = label)) +
geom_point(
aes(size = minuslog10qval, fill = fc),
shape = 21,
color = "black"
) +
scale_fill_gradientn(
name = "enrichmnet\nfold change",
# breaks = c(0, 1, 2, 4, 6),
colours = c(
"white", "#fee5d9", "#fcae91", "#fb6a4a", "#de2d26", "#a50f15", "#7a0105"
)
) +
scale_size_continuous(
name = "-log10 FDR",
breaks = c(0, 10, 20)
) +
theme(
panel.grid.major = element_line(size = 0.25),
axis.text.y = element_text(size = 8),
axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust = 1),
legend.direction = "vertical"
)
Significant motifs assigned to TFs enrichment heatmap
mt_marks_tf <- mt_marks_dt[!is.na(gene)]
mt_marks_tf[, label := paste(
substr(name, 1, 45), gene, motif, sep = " | "
)]
mt_marks_tf[, label := factor(label, levels = unique(mt_marks_tf$label))]
# plot
gp <- ggplot(mt_marks_tf, aes(group, label, label = label)) +
geom_point(
aes(size = minuslog10qval, fill = fc),
shape = 21,
color = "black"
) +
scale_fill_gradientn(
name = "enrichmnet\nfold change",
# breaks = c(0, 1, 2, 4, 6),
colours = c(
"white", "#fee5d9", "#fcae91", "#fb6a4a", "#de2d26", "#a50f15", "#7a0105"
)
) +
scale_size_continuous(
name = "-log10 FDR",
breaks = c(0, 10, 20)
) +
theme(
panel.grid.major = element_line(size = 0.25),
axis.text.y = element_text(size = 8),
axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust = 1),
legend.direction = "vertical"
)
gp
How many expressed TFs have motifs?
# TF annotation
tfs_annt <- fread(
file.path("annotation", "tfs.Nvec.tsv"),
header = TRUE
)[, .SD[1], gene]
# groupped genes
mark_dt <- fread(file.path(
"RNASEQ_QUANTIFICATION", "results", "genes_clusters.tsv"
))
mark_dt[, group := paste0("G", group)]
gene_dt <- fread(file.path(
"RNASEQ_QUANTIFICATION", "results", "genes_clusters_predicted.tsv"
))
gene_dt[, group := group_xgb]
group_dt <- rbindlist(list(
clustered = mark_dt[, .(gene, group)],
predicted = gene_dt[, .(gene, group)]
), idcol = "method")
group_dt[, group := factor(
group,
levels = paste0("G", unique(
sort(as.integer(str_remove(group_dt$group, "G")))
))
)]
message(sprintf(
"Total genes: %i", length(group_dt$gene)
))
## Total genes: 20535
# 20535
# expressed genes
con_mt <- read.table(
file.path("RNASEQ_QUANTIFICATION", "raw_counts_rnaseq.tsv"),
header=TRUE, row.names = 1
)
expr_genes <- names(which(apply(con_mt, 1, function(x) sum(x) > 100) == TRUE))
message(sprintf(
"Expressed genes: %i", length(expr_genes)
))
## Expressed genes: 19288
# 19288
# expressed TFs
expr_tfs <- expr_genes[expr_genes %in% tfs_annt$gene]
message(sprintf(
"Expressed TFs: %i", length(expr_tfs)
))
## Expressed TFs: 740
# 740
# motif to tf assignment
dc <- fread(file.path(
"annotation",
"assignment-archetype-motif-gene.tsv.gz"
))[, .(gene, archetype_name)]
# expressed TFs with motif
expr_dt <- group_dt[gene %in% expr_genes]
expr_dt[, TF := gene %in% expr_tfs]
expr_dt[, TF := factor(TF, levels = c(TRUE, FALSE))]
setorder(expr_dt, group, TF)
expr_dt <- merge.data.table(
expr_dt, dc, by = "gene", all.x = TRUE, sort = FALSE
)
expr_tfs_mot <- unique(expr_dt[TF==TRUE & !is.na(archetype_name)]$gene)
message(sprintf(
"Expressed TFs with motif assigned: %i/%i (%.2f%%)",
length(expr_tfs_mot), length(expr_tfs),
length(expr_tfs_mot) / length(expr_tfs) * 100
))
## Expressed TFs with motif assigned: 589/740 (79.59%)
# 589/740 (79.59%)
Merge replicates per condition
bam_dir="ATACSEQ/nucleosome_free_regions/bam/"
nth=18
for line in Fox Elav Ncol
do
for cond in pos neg
do
name=${line}_${cond}
echo ${name}
bams=$( echo ${bam_dir}/${name}*bam )
samtools merge -@ ${nth} ${bam_dir}/${name}.ncfree.bam ${bams}
samtools sort -@ ${nth} -o ${bam_dir}/${name}.ncfree.sorted.bam ${bam_dir}/${name}.ncfree.bam
rm ${bam_dir}/${name}.ncfree.bam
samtools index -@ ${nth} ${bam_dir}/${name}.ncfree.sorted.bam
done
done
Footprint score calculation for consensus peaks
conda activate TOBIAS_ENV
bam_dir="ATACSEQ/nucleosome_free_regions/bam/"
out_dir="ATACSEQ/nucleosome_free_regions/footprint/"
mkdir ${out_dir}
peaks="ATACSEQ/nucleosome_free_regions/consensus_peaks/consensusSeekeR-peaks.bed"
genome="genome/Nvec_vc1.1_gDNA.fasta"
nth=12
for line in Fox Elav Ncol
do
for cond in pos neg
do
name=${line}_${cond}
echo $(date) "- Starting ATACorrect for" ${name}
TOBIAS ATACorrect \
--bam ${bam_dir}/${name}.ncfree.sorted.bam \
--genome ${genome} \
--peaks ${peaks} \
--prefix ${name} \
--outdir ${out_dir}/ATACorrect \
--cores ${nth}
echo $(date) "- Starting FootprintScores"
TOBIAS FootprintScores \
--signal ${out_dir}/ATACorrect/${name}_corrected.bw \
--regions ${peaks} \
--fp-min 10 --fp-max 50 \
--output ${out_dir}/${name}_footprints.bw \
--cores ${nth}
done
done
Plot difference in footprints
conda activate TOBIAS_ENV
bam_dir="ATACSEQ/nucleosome_free_regions/bam/"
bwg_dir="ATACSEQ/nucleosome_free_regions/footprint/ATACorrect"
plt_dir="ATACSEQ/nucleosome_free_regions/footprint/plots"
mkdir ${plt_dir}
peaks="ATACSEQ/nucleosome_free_regions/consensus_peaks/consensusSeekeR-peaks.bed"
for line in Fox Elav Ncol
do
TOBIAS PlotAggregate --TFBS ${peaks} \
--signals ${bwg_dir}/${line}_pos_corrected.bw ${bwg_dir}/${line}_neg_corrected.bw \
--output ${plt_dir}/${line}_footprint_comparison_all_peaks.pdf \
--flank 125 \
--share_y both \
--plot_boundaries \
--signal-on-x
done
TOBIAS PlotAggregate --TFBS ${peaks} \
--signals ${bwg_dir}/Elav_pos_corrected.bw ${bwg_dir}/Fox_pos_corrected.bw ${bwg_dir}/Ncol_pos_corrected.bw \
--output ${plt_dir}/pos_footprint_comparison_all_peaks.pdf \
--flank 150 \
--share_y both \
--plot_boundaries \
--signal-on-x
Combine motif scores and footprint scores to determine motif binding.
mamba activate tobias
bwg_dir="ATACSEQ/nucleosome_free_regions/footprint/"
out_dir="ATACSEQ/nucleosome_free_regions/footprint/BINDetect"
mkdir ${out_dir}
peaks="ATACSEQ/nucleosome_free_regions/consensus_peaks/consensusSeekeR-peaks.bed"
motifs="annotation/motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp-pwms.meme"
genome="genome/Nvec_vc1.1_gDNA.fasta"
nth=12
# have to split motifs in multiple files because of the open file limit
#for x in {1..4}
#do
# motifs="annotation/motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp-pwms-binned/motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp-pwms-bin"${x}".meme"
# alternatively, motifs for TFs only
x="tfs"
motifs="annotation/motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp-pwms-tfs.meme"
line1=Fox
line2=Elav
line3=Ncol
mkdir ${out_dir}/${x}
echo $(date) "- Starting BINDetect for" ${x}
TOBIAS BINDetect \
--motifs ${motifs} \
--signals \
${bwg_dir}/${line1}_pos_footprints.bw \
${bwg_dir}/${line1}_neg_footprints.bw \
${bwg_dir}/${line2}_pos_footprints.bw \
${bwg_dir}/${line2}_neg_footprints.bw \
${bwg_dir}/${line3}_pos_footprints.bw \
${bwg_dir}/${line3}_neg_footprints.bw \
--genome ${genome} \
--peaks ${peaks} \
--outdir ${out_dir}/${x} \
--cond_names \
${line1}_pos ${line1}_neg \
${line2}_pos ${line2}_neg \
${line3}_pos ${line3}_neg \
--cores ${nth}
#done
Combine motif binding scores
bind_dir <- file.path(dat_dir, "footprint", "BINDetect")
bind_dt <- rbindlist(lapply(c("tfs"), function(x) {
fn <- list.files(
file.path(bind_dir, x),
pattern = "bindetect_results.txt", full.names = TRUE
)
dt <- fread(fn)
dt[, name := NULL]
dt[, output_prefix := NULL]
dt[, cluster := NULL]
dt
}))
fwrite(bind_dt, file.path(
bind_dir, "bindetect_results.txt"
), sep = "\t")
Volcano plots
# bindetect results
bind_dt <- fread(file.path(
bind_dir, "bindetect_results.txt"
))
# motif to tf assignment
dc <- fread(file.path(
"annotation",
"assignment-archetype-motif-gene.tsv.gz"
))[, .(gene, archetype_name)]
dc[, motif := str_extract(archetype_name, "ARCH\\d+")]
dc[, archetype_name := NULL]
dc_tf <- merge.data.table(
dc, tfs_annt, by = "gene", all.x = TRUE
)
dc_tf[gene %in% names(marks_gold_v), name := marks_gold_v[gene]]
# coparisons
comparas <- list(
c("Fox_pos", "Fox_neg"),
c("Elav_pos", "Elav_neg"),
c("Ncol_pos", "Ncol_neg"),
c("Fox_pos", "Elav_pos"),
c("Fox_pos", "Ncol_pos"),
c("Elav_pos", "Ncol_pos")
)
# volcano plots
gpv_list <- lapply(comparas, function(x) {
cols <- c(
"motif_id",
sprintf("%s_%s_change", x[[1]], x[[2]]),
sprintf("%s_%s_pvalue", x[[1]], x[[2]])
)
dt <- unique(bind_dt[, ..cols])
setnames(dt, c("motif_name", "log2FoldChange", "pvalue"))
dt[, motif := str_extract(motif_name, "ARCH\\d+")]
dt <- merge.data.table(dt, dc_tf, by = "motif", all.x = TRUE)
dt[, motif_label := name]
dt[motif_label == "", name := pfam]
dt[, motif_label := strtrim(motif_label, 45)]
dt[, minuslog10pval := -1 * log10(pvalue)]
dt[, minuslog10padj := NA]
ggplotVolcano(
dt,
padj_thr = NULL, pval_thr = 0.05,
lfc_thr = 0.2,
sign_col = c("red", "blue"),
lims_fc = c(-0.5, 0.5),
lims_sig = c(0, 150),
title = sprintf("%s vs %s", x[[1]], x[[2]]),
xlab = "differential binding score",
y = "-log10(p value)",
label_column = "motif_label", label_size = 1, label_segment_alpha = 0.2,
label_fc_thr = 0.2
) +
theme(panel.grid.major = element_line(color = "gray", linewidth = 0.2))
})
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
gpv <- (wrap_plots(gpv_list, nrow = 2) & theme(legend.position = "bottom")) +
plot_layout(guides = "collect")
gpv
## Warning: Removed 28 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 10 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 2 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 17 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 71 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 7 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 8 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 22 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 1 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 38 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 24 rows containing missing values (`geom_text_repel()`).
Save parsed results
# parse data
bind_dt <- rbindlist(lapply(comparas, function(x) {
cols <- c(
"motif_id",
sprintf("%s_%s_change", x[[1]], x[[2]]),
sprintf("%s_%s_pvalue", x[[1]], x[[2]])
)
dt <- unique(bind_dt[, ..cols])
setnames(dt, c("motif_name", "log2FoldChange", "pvalue"))
dt[, motif := str_extract(motif_name, "ARCH\\d+")]
dt <- merge.data.table(dt, dc_tf, by = "motif", all.x = TRUE)
dt[, compara := sprintf("%s_%s", x[[1]], x[[2]])]
dt
}))
fwrite(bind_dt, file.path(
bind_dir, "bindetect_results_parsed.txt"
), sep = "\t")
fwrite(bind_dt[pvalue < 0.05], file.path(
bind_dir, "bindetect_results_significant.txt"
), sep = "\t")
Parse all binding scores per sample (from all comparisons)
# load diff motif binding results
hm_dt <- fread(file.path(bind_dir, "bindetect_results_parsed.txt"))
#hm_dt <- fread(file.path(bind_dir, "bindetect_results_significant.txt"))
# get motifs from all comparisons per line
diff_mots_dt <- rbindlist(sapply(c("Elav", "Fox", "Ncol"), function(smp) {
# select motifs from comparison with neg control
dt_crt <- hm_dt[compara == sprintf("%s_pos_%s_neg", smp, smp)]
# select motifs from comparison with other lines
if (smp == "Elav") {
dt_dif <- rbindlist(list(
"neg" = dt_crt,
"Fox" = hm_dt[compara == "Fox_pos_Elav_pos"][
, log2FoldChange := -1 * log2FoldChange],
"Ncol" = hm_dt[compara == "Elav_pos_Ncol_pos"]
), idcol = "vs")
} else if (smp == "Fox") {
dt_dif <- rbindlist(list(
"neg" = dt_crt,
"Elav" = hm_dt[compara == "Fox_pos_Elav_pos"],
"Ncol" = hm_dt[compara == "Fox_pos_Ncol_pos"]
), idcol = "vs")
} else if (smp == "Ncol") {
dt_dif <- rbindlist(list(
"neg" = dt_crt,
"Fox" = hm_dt[compara == "Fox_pos_Ncol_pos"][
, log2FoldChange := -1 * log2FoldChange],
"Elav" = hm_dt[compara == "Elav_pos_Ncol_pos"][
, log2FoldChange := -1 * log2FoldChange]
), idcol = "vs")
}
dt_dif[, compara := NULL]
setcolorder(dt_dif, c("vs", "motif", "log2FoldChange", "pvalue"))
dt_dif
}, USE.NAMES = TRUE, simplify = FALSE), idcol = "reporterline")
fwrite(
diff_mots_dt,
file.path(bind_dir, "bindetect_results_reporterline.txt"),
sep = "\t"
)
#fwrite(
# diff_mots_dt,
# file.path(bind_dir, "bindetect_results_significant_reporterline.txt"),
# sep = "\t"
#)
Plot dotplot
# BINDetect results
diff_mots_dt <- fread(file.path(
bind_dir, "bindetect_results_reporterline.txt"
))
# significant
pval_thr <- 0.05
fc_thr <- 0.2
siginifcant_mots <- diff_mots_dt[
pvalue < pval_thr & abs(log2FoldChange) > fc_thr
]$motif
dp_dt <- diff_mots_dt[motif %in% siginifcant_mots]
dp_dt[, id := paste(reporterline, vs, sep = " vs ")]
# order samples
dp_dt[, reporterline := factor(reporterline, levels = c("Elav", "Fox", "Ncol"))]
dp_dt[, vs := factor(vs, levels = c("neg", "Elav", "Fox", "Ncol"))]
lv_dt <- CJ(
levels(dp_dt$reporterline),
levels(dp_dt$vs)
)[V1 != V2][, id := paste(V1, V2, sep = " vs ")]
lv_dt[, V1 := factor(V1, levels = c("Elav", "Fox", "Ncol"))]
lv_dt[, V2 := factor(V2, levels = c("neg", "Elav", "Fox", "Ncol"))]
setorder(lv_dt, V1, V2)
dp_dt[, id := factor(id, levels = lv_dt$id)]
dp_dt[, motif_id := paste(motif, gene, sep = "__")]
# order motifs
gpv_dt <- dcast.data.table(
dp_dt, motif_id ~ id, value.var = "log2FoldChange"
)
gpv_mt <- as.matrix(gpv_dt[, -1])
rownames(gpv_mt) <- unique(dp_dt$motif_id)
mord <- rev(order(apply(gpv_mt, 1, which.max)))
mord <- hclust(dist(gpv_mt), method = "ward.D2")$order
dp_dt[, motif_id := factor(motif_id, levels = rownames(gpv_mt)[mord])]
setorder(dp_dt, id, motif_id)
# labels
dp_dt[, motif_label := paste(substr(name, 1, 35), gene, motif, sep = " | ")]
dp_dt[, motif_label := factor(motif_label, levels = unique(dp_dt$motif_label))]
# plot
dp_dt[, minuslog10pvalue := -1 * log10(pvalue)]
gp_dot <- ggplot(dp_dt, aes(
id, motif_label,
size = minuslog10pvalue, fill = log2FoldChange
)) +
geom_point(shape = 21) +
scale_fill_gradientn(
colours = colorRampPalette(RColorBrewer::brewer.pal(11, "BrBG"))(1000),
limits = c(-0.35, 0.35), oob = scales::squish, breaks = c(-0.3, 0, 0.3),
name = "log2(fold change)",
) +
scale_size_continuous(
name = "-log10(q value)",
range = c(1, 7),
breaks = c(10, 50, 100, 150)
) +
theme(
panel.grid.major = element_line(size = 0.25),
axis.text.y = element_text(size = 10),
axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust = 1),
axis.title.x = element_blank(),
legend.direction = "vertical"
) +
facet_grid(. ~ reporterline, scales = "free")
Parse bound files for all motifs
bound_dt <- rbindlist(lapply("tfs", function(x) {
message(x)
mot_dirs <- list.dirs(file.path(bind_dir, x), recursive = FALSE)
mot_dirs <- grep("ARCH", mot_dirs, value = TRUE)
mot <- str_remove(basename(mot_dirs), "^_")
names(mot_dirs) <- mot
rbindlist(sapply(mot, function(m) {
message(m)
fns <- list.files(
file.path(mot_dirs[m], "beds"),
pattern = "_bound.bed",
full.names = TRUE
)
names(fns) <- str_extract(basename(fns), "(Elav|Fox|Ncol)_(pos|neg)")
rbindlist(sapply(names(fns), function(fn) {
fread(fns[fn])
}, USE.NAMES = TRUE, simplify = FALSE), idcol = "sample")
}, simplify = FALSE, USE.NAMES = TRUE))
}))
bound_dt[, V11 := V13][, V13 := NULL]
bed_cols <- c(
"seqnames", "start", "end", "motif_name", "score", "strand",
"seqnames_peak", "start_peak", "end_peak",
"peak", "peak_score", "peak_strand"
)
setnames(bound_dt, c("sample", bed_cols))
bound_dt[, motif_name := str_remove(motif_name, "^_")]
# add motif-to-gene assignments
bound_dt[, motif := str_extract(motif_name, "ARCH\\d+")]
bound_dt <- merge.data.table(
bound_dt, dc_tf, by = "motif",
all.x = TRUE, sort = FALSE, allow.cartesian = TRUE
)
# save
fwrite(
bound_dt,
file.path(bind_dir, "motifs_bound.tsv.gz"),
sep = "\t"
)
Subset targets for expressed genes.
# load motif binding in peaks
bound_dt <- fread(file.path(bind_dir, "motifs_bound.tsv.gz"))
# order samples
bound_dt[, sample := factor(sample, levels = c(
"Elav_pos", "Elav_neg",
"Fox_pos", "Fox_neg",
"Ncol_pos", "Ncol_neg"
))]
# add peak to target gene assignment
assign <- fread(file.path(res_dir, "consensusSeekeR-peaks-gene-assignment.tsv"))
assign <- unique(assign[, .(gene, peak)])
setnames(assign, "gene", "target_gene")
bound_dt <- merge.data.table(
bound_dt, assign, by = "peak",
all.x = TRUE, sort = FALSE, allow.cartesian = TRUE
)
bound_dt <- bound_dt[!is.na(target_gene)]
# parse only motif + gene + target gene + sample info, for pos samples
mo_dt <- bound_dt[order(score)][, .SD[1], .(motif, gene, target_gene, sample)][grep("pos", sample)]
mo_dt <- unique(mo_dt[, .(
motif, motif_name, gene, name, pfam, target_gene, sample
)])
mo_dt[, n_samples := length(unique(.SD$sample)), .(motif, gene, target_gene)]
# add target gene annotation
target_gene_ann <- copy(gene_ann)
setnames(
target_gene_ann,
colnames(target_gene_ann),
paste0("target_", colnames(target_gene_ann))
)
mo_dt <- merge.data.table(
mo_dt, target_gene_ann, by = "target_gene",
all.x = TRUE, sort = FALSE
)
mo_dt[is.na(target_pfam), target_pfam := ""]
mo_dt[is.na(target_name), target_name := ""]
# gene expression tpm
tpm_mt <- read.table(
file.path("RNASEQ_QUANTIFICATION", "raw_counts_rnaseq.tsv"),
header = TRUE, row.names = 1
)
# aggregate replicates
smpl <- paste(
str_extract(colnames(tpm_mt), "Fox|Elav|Ncol"),
str_to_lower(str_extract(colnames(tpm_mt), "Pos|Neg")),
sep = "_"
)
smpl_mt <- tapply(colnames(tpm_mt), smpl, function(y) {
apply(tpm_mt[, y], 1, mean)
}, simplify = FALSE)
smpl_mt <- do.call("cbind", smpl_mt)
# expressed genes
tpm_genes_dt <- rbindlist(apply(smpl_mt, 2, function(x) {
data.table(expression_tpm = x)[, gene := names(x)]
}, simplify = FALSE), idcol = "sample")
mo_dt <- merge.data.table(
mo_dt, tpm_genes_dt,
by = c("gene", "sample"),
all.x = TRUE, sort = FALSE
)
mo_dt[, expressed := FALSE][expression_tpm > 100, expressed := TRUE]
unique(
mo_dt[, .(sample, gene, expressed)]
)[, .N, .(sample, expressed)][order(sample, expressed)]
## sample expressed N
## 1: Elav_pos FALSE 119
## 2: Elav_pos TRUE 479
## 3: Fox_pos FALSE 123
## 4: Fox_pos TRUE 475
## 5: Ncol_pos FALSE 138
## 6: Ncol_pos TRUE 460
# expressed target genes
tmp_tgenes_dt <- copy(tpm_genes_dt)
setnames(tmp_tgenes_dt, c("gene", "expression_tpm"), c("target_gene", "target_expression_tpm"))
mo_dt <- merge.data.table(
mo_dt, tmp_tgenes_dt,
by = c("target_gene", "sample"),
all.x = TRUE, sort = FALSE
)
mo_dt[, target_expressed := FALSE][target_expression_tpm > 100, target_expressed := TRUE]
unique(
mo_dt[, .(sample, target_gene, target_expressed)]
)[, .N, .(sample, target_expressed)][order(sample, target_expressed)]
## sample target_expressed N
## 1: Elav_pos FALSE 682
## 2: Elav_pos TRUE 7994
## 3: Fox_pos FALSE 786
## 4: Fox_pos TRUE 7921
## 5: Ncol_pos FALSE 733
## 6: Ncol_pos TRUE 7778
# gene expression lfc
lfc_mt <- read.table(
file.path("RNASEQ_QUANTIFICATION", "results", "log2fc_expression.tsv"),
sep = "\t",
header = TRUE
)
lfc_mt <- lfc_mt[, grep("vs_neg", colnames(lfc_mt))]
lfc_dt <- melt.data.table(
as.data.table(lfc_mt, keep.rownames = "gene"),
id.vars = "gene",
variable.name = "sample",
value.name = "expression_lfc"
)
lfc_dt[, sample := str_replace_all(sample, "_vs_neg", "_pos")]
mo_dt <- merge.data.table(
mo_dt, lfc_dt, by = c("sample", "gene"),
all.x = TRUE, sort = FALSE
)
# target expression lfc
lfc_tg <- copy(lfc_dt)
setnames(lfc_tg, c("gene", "expression_lfc"), c("target_gene", "target_expression_lfc"))
mo_dt <- merge.data.table(
mo_dt, lfc_tg, by = c("sample", "target_gene"),
all.x = TRUE, sort = FALSE
)
# save
mo_dt <- mo_dt[!is.na(target_gene)]
setcolorder(mo_dt, c(
"sample", "gene", "target_gene", "name", "pfam", "target_name", "target_pfam",
"expression_tpm", "target_expression_tpm",
"expression_lfc", "target_expression_lfc",
"expressed", "target_expressed",
"motif", "motif_name",
"n_samples"
))
fwrite(
mo_dt,
file.path(res_dir, "motifs_genes_targets_long.tsv.gz"),
sep = "\t"
)
How many target genes for each TF?
hits_dt <- unique(mo_dt[expressed == TRUE][, .(sample, motif, gene, target_gene)])
hits_dt <- hits_dt[, .N, .(motif, sample)]
hits_gp <- ggplot(hits_dt, aes(sample, N, fill = sample)) +
geom_boxplot() +
scale_y_continuous(trans = "log10") +
scale_fill_manual(values = line_cond_cols) +
labs(y = "number of target genes per TF") +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
legend.position = "none",
panel.grid.major.y = element_line(linewidth = 0.5),
panel.grid.minor.y = element_line(linewidth = 0.2)
)
hits_gp